Emiley Garcia-Zych
#install necessary packages
##install.packages("curl")
##install.packages("car")
library(curl)
## Using libcurl 8.1.2 with LibreSSL/3.3.6
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN588_Fall23/zombies.csv")
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = FALSE)
m <- lm(data = d, height ~ weight)
SSY <- sum((m$model$height - mean(m$model$height))^2) # height - mean(height)
SSY
## [1] 18558.61
SSR <- sum((m$fitted.values - mean(m$model$height))^2) # predicted height - mean height
SSR
## [1] 12864.82
SSE <- sum((m$model$height - m$fitted.values)^2) # height - predicted height
SSE
## [1] 5693.785
df_regression <- 1
df_error <- 998
df_y <- 999
MSR <- SSR/df_regression
MSE <- SSE/df_error
MSY <- SSY/df_y
fratio <- MSR/MSE
fratio
## [1] 2254.931
curve(df(x, df = 1, df2 = 1), col = "green", lty = 3, lwd = 2, xlim = c(0, 10),
main = "Some Example F Distributions\n(vertical line shows critical value for df1=1,df2=998)",
ylab = "f(x)", xlab = "x")
curve(df(x, df = 2, df2 = 2), col = "blue", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 4, df2 = 4), col = "red", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 8, df2 = 6), col = "purple", lty = 3, lwd = 2, add = TRUE)
curve(df(x, df = 1, df2 = 998), col = "black", lwd = 3, add = TRUE)
legend("top", c("df1=1,df2=1", "df1=2,df2=2", "df1=4,df2=4", "df1=8,df2=6",
"df1=1,df2=998"), lty = 3, lwd = 2, col = c("green", "blue", "red", "purple",
"black"), bty = "n", cex = 0.75)
fcrit <- qf(p = 0.95, df1 = 1, df2 = 998)
fcrit
## [1] 3.850793
a <- aov(data = d, height ~ weight)
summary(a)
## Df Sum Sq Mean Sq F value Pr(>F)
## weight 1 12865 12865 2255 <2e-16 ***
## Residuals 998 5694 6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.aov(m)
## Df Sum Sq Mean Sq F value Pr(>F)
## weight 1 12865 12865 2255 <2e-16 ***
## Residuals 998 5694 6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rsquared <- SSR/SSY
rsquared
## [1] 0.6931998
rho <- sqrt(rsquared)
rho
## [1] 0.8325862
SSX <- sum((m$model$weight - mean(m$model$weight))^2)
SEbeta1 <- sqrt(MSE/SSX)
SEbeta1
## [1] 0.004106858
SEbeta0 <- sqrt((MSE * sum(m$model$weight^2))/(1000 * SSX))
SEbeta0
## [1] 0.5958147
SEyhat <- sqrt(MSE * (1/1000 + (m$model$weight - mean(m$model$weight))^2/SSX))
head(SEyhat) # just the first 6 rows
## [1] 0.08978724 0.07620966 0.08414480 0.09533986 0.08904151 0.08341218
summary(m)
##
## Call:
## lm(formula = height ~ weight, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1519 -1.5206 -0.0535 1.5167 9.4439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.565446 0.595815 66.41 <2e-16 ***
## weight 0.195019 0.004107 47.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.389 on 998 degrees of freedom
## Multiple R-squared: 0.6932, Adjusted R-squared: 0.6929
## F-statistic: 2255 on 1 and 998 DF, p-value: < 2.2e-16
Calculate the residuals from the regression of zombie height on weight and plot these in relation to weight (the x variable). There are lots of ways to do this quickly.
m <- lm(data = d, height ~ weight)
plot(x = d$weight, y = m$residuals)
# or
e <- resid(m)
plot(x = d$weight, y = e)
hist(e, xlim = c(-4 * sd(e), 4 * sd(e)), breaks = 20, main = "Histogram of Residuals")
plot(m$model$weight, m$residuals)
par(mfrow = c(2, 2))
qqnorm(m$residuals)
library(car)
## Loading required package: carData
qqPlot(m$residuals)
## [1] 954 799
s <- shapiro.test(m$residuals)
s
##
## Shapiro-Wilk normality test
##
## data: m$residuals
## W = 0.99713, p-value = 0.07041
Load in the "KamilarAndCooper.csv" dataset and develop a linear model to look at the relationship between "weaning age" and "female body mass". You will probably need to look at the data and variable names again to find the appropriate variables to examine.
Using the procedures outlined above and in Module 12, calculate
estimates of β0 and β1 by hand *and using
the lm() function. Are the regression coefficients
estimated under a simple linear model statistically significantly
different from zero?
Construct an ANOVA table by hand and compare your values to the
results of running lm()and then looking
at summary.aov(lm()).
Generate the residuals for your linear model by hand, plot them in relation to female body weight, and make a histogram of the residuals. Do they appear to be normally distributed?
Run the plot() command on the result
of lm() and examine the 4 plots produced. Again, based on
examination of the residuals and the results of Shapiro-Wilks test, does
it look like your model has good fit?
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN588_Fall23/KamilarAndCooperData.csv")
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = FALSE)
head(d)
## Scientific_Name Family Genus Species
## 1 Allenopithecus_nigroviridis Cercopithecidae Allenopithecus nigroviridis
## 2 Allocebus_trichotis Cercopithecidae Allocebus trichotis
## 3 Alouatta_belzebul Atelidae Alouatta belzebul
## 4 Alouatta_caraya Atelidae Alouatta caraya
## 5 Alouatta_guariba Atelidae Alouatta guariba
## 6 Alouatta_palliata Atelidae Alouatta palliata
## Brain_Size_Species_Mean Brain_Size_Female_Mean Brain_size_Ref
## 1 58.02 53.70 Isler et al 2008
## 2 NA NA
## 3 52.84 51.19 Isler et al 2008
## 4 52.63 47.80 Isler et al 2008
## 5 51.70 49.08 Isler et al 2008
## 6 49.88 48.04 Isler et al 2008
## Body_mass_male_mean Body_mass_female_mean Mass_Dimorphism
## 1 6130 3180 1.928
## 2 92 84 1.095
## 3 7270 5520 1.317
## 4 6525 4240 1.539
## 5 5800 4550 1.275
## 6 7150 5350 1.336
## Mass_Ref MeanGroupSize AdultMales AdultFemale AdultSexRatio
## 1 Isler et al 2008 NA NA NA NA
## 2 Smith and Jungers 1997 1.00 1.00 1.0 NA
## 3 Isler et al 2008 7.00 1.00 1.0 1.00
## 4 Isler et al 2008 8.00 2.30 3.3 1.43
## 5 Isler et al 2008 6.53 1.37 2.2 1.61
## 6 Isler et al 2008 12.00 2.90 6.3 2.17
## Social_Organization_Ref
## 1
## 2 Kappeler 1997
## 3 Campbell et al 2007
## 4 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
## 5 Campbell et al 2007
## 6 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
## InterbirthInterval_d Gestation WeaningAge_d MaxLongevity_m LitterSz
## 1 NA NA 106.15 276.0 1.01
## 2 NA NA NA NA 1.00
## 3 NA NA NA NA NA
## 4 337.62 187 323.16 243.6 1.01
## 5 NA NA NA NA NA
## 6 684.37 186 495.60 300.0 1.02
## Life_History_Ref GR_MidRangeLat_dd Precip_Mean_mm Temp_Mean_degC AET_Mean_mm
## 1 Jones et al. 2009 -0.17 1574.0 25.2 1517.8
## 2 -16.59 1902.3 20.3 1388.2
## 3 -6.80 1643.5 24.9 1286.6
## 4 Jones et al. 2009 -20.34 1166.4 22.9 1193.1
## 5 -21.13 1332.3 19.6 1225.7
## 6 Jones et al. 2009 6.95 1852.6 23.7 1300.0
## PET_Mean_mm Climate_Ref HomeRange_km2 HomeRangeRef DayLength_km
## 1 1589.4 Jones et al. 2009 NA NA
## 2 1653.7 Jones et al. 2009 NA NA
## 3 1549.8 Jones et al. 2009 NA NA
## 4 1404.9 Jones et al. 2009 NA 0.40
## 5 1332.2 Jones et al. 2009 0.03 Jones et al. 2009 NA
## 6 1633.9 Jones et al. 2009 0.19 Jones et al. 2009 0.32
## DayLengthRef Territoriality Fruit Leaves Fauna DietRef1
## 1 NA NA
## 2 NA NA
## 3 NA 57.3 19.1 0.0 Campbell et al. 2007
## 4 Nunn et al. 2003 NA 23.8 67.7 0.0 Campbell et al. 2007
## 5 NA 5.2 73.0 0.0 Campbell et al. 2007
## 6 Nunn et al. 2003 0.6506 33.1 56.4 0.0 Campbell et al. 2007
## Canine_Dimorphism Canine_Dimorphism_Ref Feed Move Rest Social
## 1 2.210 Plavcan & Ruff 2008 NA NA NA NA
## 2 NA NA NA NA NA
## 3 1.811 Plavcan & Ruff 2008 13.75 18.75 57.30 10.00
## 4 1.542 Plavcan & Ruff 2008 15.90 17.60 61.60 4.90
## 5 1.783 Plavcan & Ruff 2008 18.33 14.33 64.37 3.00
## 6 1.703 Plavcan & Ruff 2008 17.94 12.32 66.14 3.64
## Activity_Budget_Ref
## 1
## 2
## 3 Campbell et al. 2007
## 4 Campbell et al. 2007
## 5 Campbell et al. 2007
## 6 Campbell et al. 2007
plot(data = d, WeaningAge_d ~ Body_mass_female_mean)
model <- lm(data = d, WeaningAge_d ~ Body_mass_female_mean)
summary(model)
##
## Call:
## lm(formula = WeaningAge_d ~ Body_mass_female_mean, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -720.39 -115.48 -54.05 58.11 471.22
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.016e+02 2.012e+01 10.02 <2e-16 ***
## Body_mass_female_mean 2.013e-02 1.927e-03 10.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 183.4 on 114 degrees of freedom
## (97 observations deleted due to missingness)
## Multiple R-squared: 0.489, Adjusted R-squared: 0.4845
## F-statistic: 109.1 on 1 and 114 DF, p-value: < 2.2e-16
plot(model)
qqPlot(model$residuals)
## 97 44
## 48 24
s <- shapiro.test(model$residuals)
s
##
## Shapiro-Wilk normality test
##
## data: model$residuals
## W = 0.86291, p-value = 5.825e-09
Return to the "KamilarAndCooper.csv" dataset you were looking at above and log transform both of your variables and then run a simple bivariate linear model. Do you notice a difference between these results and those obtained using untransformed variables?
d$logWeaningAge <- log(d$WeaningAge_d)
d$logFemaleBodyMass <- log(d$Body_mass_female_mean)
plot(data = d, logWeaningAge ~ logFemaleBodyMass)
model <- lm(data = d, logWeaningAge ~ logFemaleBodyMass)
summary(model)
##
## Call:
## lm(formula = logWeaningAge ~ logFemaleBodyMass, data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10639 -0.32736 0.00848 0.32214 1.11010
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.7590 0.2196 8.011 1.08e-12 ***
## logFemaleBodyMass 0.4721 0.0278 16.983 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4532 on 114 degrees of freedom
## (97 observations deleted due to missingness)
## Multiple R-squared: 0.7167, Adjusted R-squared: 0.7142
## F-statistic: 288.4 on 1 and 114 DF, p-value: < 2.2e-16
plot(model)
s <- shapiro.test(model$residuals)
s
##
## Shapiro-Wilk normality test
##
## data: model$residuals
## W = 0.99367, p-value = 0.8793
par(mfrow = c(1, 2))
a <- 2
b <- 2
# log x
x <- seq(from = 0, to = 100, length.out = 1000)
y <- a + b * log(x)
plot(x, y, type = "l", main = "untransformed")
plot(log(x), y, type = "l", main = "log(x)")
# log y
x <- seq(from = 0, to = 10, length.out = 1000)
y <- exp(a + b * x)
plot(x, y, type = "l", main = "untransformed")
plot(x, log(y), type = "l", main = "log(y)")
# assymptotic
x <- seq(from = 1, to = 100, length.out = 100)
y <- (a * x)/(1 + b * x)
plot(x, y, type = "l", main = "untransformed")
plot(1/x, y, type = "l", main = "1/x")
# reciprocal
x <- seq(from = 1, to = 100, length.out = 100)
y <- a + b/x
plot(x, y, type = "l", main = "untransformed")
plot(1/x, y, type = "l", main = "1/x")
# exp
x <- seq(from = 1, to = 10, length.out = 100)
y <- a * exp(b * x)
plot(x, y, type = "l", main = "untransformed")
plot(x, log(y), type = "l", main = "log(y)")